In [1]:
import pandas as pd
import numpy as np
store = pd.read_csv('store.csv')
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [2]:
print "Lets look at training set-"
print train.head(6)
print "Lets look at stores data-" 
print train.head(6)
print "Given number of rows for training =", train.shape[0]
print "Given number of rows for testing =", test.shape[0]


Lets look at training set-
   Store  DayOfWeek        Date  Sales  Customers  Open  Promo StateHoliday  \
0    294          2  2013-07-23   4365        626     1      0            0   
1   1068          5  2015-05-08   5234        369     1      1            0   
2    596          1  2013-06-10   3505        594     1      0            0   
3    210          5  2015-03-27   2864        285     1      0            0   
4    885          4  2014-04-03   5333        690     1      1            0   
5    222          5  2014-01-24   5180        600     1      1            0   

   SchoolHoliday  
0              1  
1              0  
2              0  
3              1  
4              0  
5              0  
Lets look at stores data-
   Store  DayOfWeek        Date  Sales  Customers  Open  Promo StateHoliday  \
0    294          2  2013-07-23   4365        626     1      0            0   
1   1068          5  2015-05-08   5234        369     1      1            0   
2    596          1  2013-06-10   3505        594     1      0            0   
3    210          5  2015-03-27   2864        285     1      0            0   
4    885          4  2014-04-03   5333        690     1      1            0   
5    222          5  2014-01-24   5180        600     1      1            0   

   SchoolHoliday  
0              1  
1              0  
2              0  
3              1  
4              0  
5              0  
Given number of rows for training = 605239
Given number of rows for testing = 106807

In [3]:
sf = pd.merge(train, store, on='Store') #merge data for analysis
print "List of columns -", sf.columns
sf['Open'] = sf['Open'].apply(lambda x: 0 if np.isnan(x) else x) #get rid of NaN values if store is 'closed'


List of columns - Index([u'Store', u'DayOfWeek', u'Date', u'Sales', u'Customers', u'Open',
       u'Promo', u'StateHoliday', u'SchoolHoliday', u'StoreType',
       u'Assortment', u'CompetitionDistance', u'CompetitionOpenSinceMonth',
       u'CompetitionOpenSinceYear', u'Promo2', u'Promo2SinceWeek',
       u'Promo2SinceYear', u'PromoInterval'],
      dtype='object')

In [4]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
#a better way to get more details, timeline view of medians of each day of the week-
day = sf[(sf['Open']!=0)]
sales_day = day.groupby('DayOfWeek')['Sales'].median()
cust_day = day.groupby('DayOfWeek')['Customers'].median()
#
fig, (axis1) = plt.subplots(1,1, sharex=True, figsize=(10,5))
# plot median sales
ax1 = sales_day.plot(legend=True, ax=axis1, marker='o',title="Fig. 1, Median sales of each day of the week")
ax1.set_xticks(sales_day.index)
tmp = ax1.set_xticklabels(sales_day.index.tolist(), rotation=90)
# overlay customer data
cust_day.plot(legend=True, ax=axis1, marker='x', secondary_y=True)


Out[5]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f1e22c27610>

In [6]:
#for more insights lets split Year-Month-Date to three different columns
def date_change(data):
    data['Month'] = data['Date'].apply(lambda x : int(str(x)[5:7]))
    data['Year'] = data['Date'].apply(lambda x : int(str(x)[:4]))
    data['MonthYear'] = data['Date'].apply(lambda x : (str(x)[:7]))
    data['date_int'] = data['Date'].apply(lambda x : (str(x)[8:10]))
    data = data.drop('Date', axis=1)
    return data
sf = date_change(sf)
test = date_change(test)

In [7]:
day = sf[(sf['Open']!=0)]
sales_day = day.groupby('date_int')['Sales'].median()
cust_day = day.groupby('date_int')['Customers'].median()
#
fig, (axis1) = plt.subplots(1,1, sharex=True, figsize=(10,5))
# plot median sales
ax1 = sales_day.plot(legend=True, ax=axis1, marker='o',title="Fig. 2: Sales vs Dates")
ax1.set_xticks(sales_day.index)
#ax1.set_yticks(date_int)
tmp = ax1.set_xticklabels(sales_day.index.tolist(), rotation=90)
# overlay customer data
cust_day.plot(legend=True, ax=axis1, marker='x', secondary_y=True)


Out[7]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f1e0db9fa50>

In [8]:
import calendar
# select all stores that were open
subs = sf[sf['Open']!=0]
# groupby Year and Month
selected_sales = subs.groupby(['Year', 'Month'])['Sales'].mean()
selected_cust = subs.groupby(['Year', 'Month'])['Customers'].mean()
# plot
fig, (axis1) = plt.subplots(1,1, figsize=(10,7))
selected_sales.unstack().T.plot(ax=axis1)
tmp = axis1.set_title("Fig. 3: Yearly Sales")
tmp = axis1.set_ylabel("Sales")
tmp = axis1.set_xticks(range(0,13))
tmp = axis1.set_xticklabels(calendar.month_abbr)



In [10]:
# median sales
median_sales = sf.groupby('MonthYear')['Sales'].median()
pct_median_change = sf.groupby('MonthYear')['Sales'].median().pct_change()
# median customers
median_cust = sf.groupby('MonthYear')['Customers'].median()
pct_median_custchange = sf.groupby('MonthYear')['Customers'].median().pct_change()

fig, (axis1, axis2) = plt.subplots(2, 1, sharex=True, figsize=(15,15))
# plot median sales
ax1 = median_sales.plot(legend=True, ax=axis1, marker='o',title="Fig. 4a: Monthly changes in Sales/Customers")
ax1.set_xticks(range(len(median_sales)))
ax1.set_xticklabels(median_sales.index.tolist(), rotation=90)
# plot pct change
ax2 = pct_median_change.plot(legend=True, ax=axis2, marker='o',rot=90, title="Fig. 4b: Percent Change")
# overlay customer data
median_cust.plot(legend=True, ax=axis1, marker='x', secondary_y=True)
pct_median_custchange.plot(legend=True, ax=axis2, marker='x', rot=90, secondary_y=True)


Out[10]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f1e0d529550>

In [11]:
fig, (axis3) = plt.subplots(1,1, sharex=True, figsize=(10,5))
# stateholiday overlay
# StateHoliday has a mixture of "O", 0 and "0", replace these with "O"
sf.StateHoliday.replace(["O",0,"0"],['O','O','O'], inplace=True)
ax3 = sns.countplot(x='MonthYear', hue='StateHoliday', data=sf[sf['StateHoliday']!='O'], palette='husl', ax=axis3)
tmp = ax3.set_xticklabels(ax3.get_xticklabels(), rotation=90)
tmp = ax3.set_title('Fig. 5: State Holidays')



In [12]:
fig, (axis4) = plt.subplots(1,1, sharex=True, figsize=(10,5))
ax4 = sns.countplot(x='MonthYear', hue='SchoolHoliday', data=sf[sf['SchoolHoliday']!=0], palette='husl', ax=axis4)
subs = sf[sf['SchoolHoliday']!=0]
#cntchange = subs.groupby('MonthYear')['SchoolHoliday'].count().pct_change()
#cntchange.plot(ax=axis4, marker='x', secondary_y=True)
tmp = ax4.set_xticklabels(ax4.get_xticklabels(), rotation=90)
tmp = ax4.set_title('Fig.6: School Holidays and Percentage Change in School Holidays')



In [13]:
fig, (axis5) = plt.subplots(1,1, sharex=True, figsize=(10,5))
#promotions overlay
ax5 = sns.countplot(x='MonthYear', hue='Promo', data=sf[sf["Promo"]!=0], palette='husl', ax=axis5)
subs = sf[sf['Promo']!=0]
cntchange = subs.groupby('MonthYear')['Promo'].count().pct_change()
cntchange.plot(ax=axis5, marker='x', secondary_y=True)
tmp = ax5.set_xticklabels(ax5.get_xticklabels(), rotation=90)
tmp = ax5.set_title('Fig. 7: Promotions and Percentage Change in Promotions')



In [ ]: